org_df <- read_excel("wuhan_blood_sample_data_Jan_Feb_2020.xlsx")
df <- org_df %>%
mutate(gender = as.factor(ifelse(gender==1, "male", "female"))) %>%
mutate(outcome = as.factor(ifelse(outcome == 0, "Survived", "Died"))) %>%
rename(admission_time = 'Admission time',
discharge_time = 'Discharge time',
hs_CRP = 'High sensitivity C-reactive protein')
names(df)[34] <- "Tumor necrosis factor alpha"
names(df)[37] <- "Interleukin 1 beta"
names(df)[68] <- "Gamma glutamyl transpeptidase"
The dataset consists of 81 variables and has 6120 observations (blood tests). See the summary below:
summary_df <- df %>% select(outcome, gender)
tbl_summary(
summary_df,
by = outcome,
label = gender ~ "Gender") %>%
modify_header(label ~ "**Variable**") %>%
add_overall() %>%
as_kable() %>% kable_paper("hover")
| Variable | Overall, N = 6,120 | Died, N = 2,905 | Survived, N = 3,215 |
|---|---|---|---|
| Gender | |||
| female | 2,390 (39%) | 751 (26%) | 1,639 (51%) |
| male | 3,730 (61%) | 2,154 (74%) | 1,576 (49%) |
The blood tests were taken from 375 different patients.
df %>% select(PATIENT_ID, gender, outcome) %>%
drop_na(PATIENT_ID) %>%
select(-PATIENT_ID) %>%
tbl_summary(label = gender ~ "Gender", by = outcome) %>%
add_overall() %>%
modify_header(label ~ "**Variable**") %>%
as_kable() %>% kable_paper("hover")
| Variable | Overall, N = 375 | Died, N = 174 | Survived, N = 201 |
|---|---|---|---|
| Gender | |||
| female | 151 (40%) | 48 (28%) | 103 (51%) |
| male | 224 (60%) | 126 (72%) | 98 (49%) |
The dataset was split into two dataframes Patients and Blood tests in order to make data analysis easier.
One additional column was created to store the hospitalization time of all patients - used in further analysis to check the relation between length stay and outcome. Go to section Patients Visualization to see basic visualizations about patients.
patients <- df %>%
select(PATIENT_ID, age, gender, admission_time, discharge_time, outcome) %>%
drop_na(PATIENT_ID) %>%
mutate("hospitalization_length" = round((difftime(discharge_time, admission_time, units = "days") ), digits = 2)) %>%
relocate(hospitalization_length, .after = discharge_time)
head(patients) %>%
kbl() %>%
kable_paper("hover")
| PATIENT_ID | age | gender | admission_time | discharge_time | hospitalization_length | outcome |
|---|---|---|---|---|---|---|
| 1 | 73 | male | 2020-01-30 22:12:47 | 2020-02-17 12:40:09 | 17.60 days | Survived |
| 2 | 61 | male | 2020-02-04 21:39:03 | 2020-02-19 12:59:01 | 14.64 days | Survived |
| 3 | 70 | female | 2020-01-23 10:59:36 | 2020-02-08 17:52:31 | 16.29 days | Survived |
| 4 | 74 | male | 2020-01-31 23:03:59 | 2020-02-18 12:59:12 | 17.58 days | Survived |
| 5 | 29 | female | 2020-02-01 20:59:54 | 2020-02-18 10:33:06 | 16.56 days | Survived |
| 6 | 81 | female | 2020-01-24 10:47:10 | 2020-02-07 09:06:58 | 13.93 days | Survived |
blood_tests_df <- df %>%
select(-c(admission_time, discharge_time)) %>%
fill(PATIENT_ID)
markers_df <- blood_tests_df %>% select (-c(PATIENT_ID, age, RE_DATE, gender))
tbl_summary(
markers_df,
by = outcome,
missing = "no") %>%
modify_header(label = "**Marker**") %>%
add_n() %>%
bold_labels() %>%
as_kable() %>%
kable_paper("hover") %>%
scroll_box(width = "100%", height = "200px")
| Marker | N | Died, N = 2,905 | Survived, N = 3,215 |
|---|---|---|---|
| Hypersensitive cardiac troponinI | 507 | 70 (18, 631) | 3 (2, 7) |
| hemoglobin | 975 | 123 (110, 135) | 127 (116, 138) |
| Serum chloride | 975 | 104 (100, 111) | 101 (99, 103) |
| Prothrombin time | 662 | 16.3 (15.0, 18.2) | 13.6 (13.1, 14.1) |
| procalcitonin | 459 | 0.38 (0.14, 1.13) | 0.04 (0.02, 0.06) |
| eosinophils(%) | 957 | 0.00 (0.00, 0.10) | 0.70 (0.00, 1.80) |
| Interleukin 2 receptor | 268 | 1,180 (807, 1,603) | 529 (400, 742) |
| Alkaline phosphatase | 930 | 83 (64, 123) | 60 (50, 75) |
| albumin | 934 | 28 (24, 31) | 36 (34, 39) |
| basophil(%) | 957 | 0.10 (0.10, 0.20) | 0.20 (0.10, 0.40) |
| Interleukin 10 | 267 | 11 (6, 17) | 5 (5, 8) |
| Total bilirubin | 930 | 14 (10, 25) | 8 (6, 12) |
| Platelet count | 957 | 112 (55, 174) | 229 (176, 290) |
| monocytes(%) | 958 | 3.0 (2.0, 4.7) | 8.2 (6.3, 10.0) |
| antithrombin | 330 | 80 (70, 92) | 93 (86, 103) |
| Interleukin 8 | 268 | 30 (18, 61) | 11 (7, 19) |
| indirect bilirubin | 906 | 6.2 (4.2, 9.2) | 4.9 (3.4, 7.1) |
| Red blood cell distribution width | 923 | 13.20 (12.40, 14.40) | 12.20 (11.80, 12.80) |
| neutrophils(%) | 957 | 92 (88, 95) | 66 (56, 76) |
| total protein | 931 | 62 (57, 68) | 68 (65, 72) |
| Quantification of Treponema pallidum antibodies | 279 | 0.06 (0.04, 0.07) | 0.05 (0.04, 0.07) |
| Prothrombin activity | 659 | 66 (56, 78) | 94 (88, 103) |
| HBsAg | 279 | 0.01 (0.00, 0.02) | 0.00 (0.00, 0.01) |
| mean corpuscular volume | 957 | 91.3 (87.1, 96.4) | 89.8 (86.8, 91.9) |
| hematocrit | 957 | 35.9 (32.5, 39.8) | 37.1 (34.3, 39.9) |
| White blood cell count | 1,127 | 12 (8, 17) | 6 (4, 8) |
| Tumor necrosis factor alpha | 268 | 11 (8, 17) | 8 (6, 10) |
| mean corpuscular hemoglobin concentration | 957 | 342 (331, 350) | 343 (335, 350) |
| fibrinogen | 566 | 3.92 (2.44, 5.63) | 4.40 (3.56, 5.34) |
| Interleukin 1 beta | 268 | 5.0 (5.0, 5.0) | 5.0 (5.0, 5.0) |
| Urea | 936 | 11 (7, 17) | 4 (3, 5) |
| lymphocyte count | 957 | 0.46 (0.31, 0.69) | 1.25 (0.87, 1.62) |
| PH value | 384 | 6.50 (6.00, 7.41) | 6.50 (6.00, 7.00) |
| Red blood cell count | 1,127 | 4.0 (3.6, 4.6) | 4.2 (3.8, 4.7) |
| Eosinophil count | 957 | 0.00 (0.00, 0.01) | 0.03 (0.00, 0.09) |
| Corrected calcium | 914 | 2.35 (2.27, 2.44) | 2.37 (2.27, 2.44) |
| Serum potassium | 980 | 4.60 (4.04, 5.27) | 4.28 (3.92, 4.62) |
| glucose | 775 | 9.1 (6.9, 13.3) | 5.7 (5.0, 7.6) |
| neutrophils count | 957 | 10.8 (7.0, 15.2) | 3.5 (2.4, 5.2) |
| Direct bilirubin | 930 | 8 (5, 14) | 4 (2, 5) |
| Mean platelet volume | 862 | 11.30 (10.70, 12.20) | 10.40 (9.90, 11.00) |
| ferritin | 283 | 1,636 (928, 2,517) | 504 (235, 834) |
| RBC distribution width SD | 923 | 43.7 (39.9, 48.5) | 39.5 (37.6, 41.4) |
| Thrombin time | 566 | 17.30 (15.80, 19.75) | 16.40 (15.60, 17.30) |
| (%)lymphocyte | 958 | 4 (2, 7) | 24 (16, 33) |
| HCV antibody quantification | 279 | 0.07 (0.04, 0.11) | 0.06 (0.04, 0.08) |
| D-D dimer | 630 | 19 (3, 21) | 1 (0, 1) |
| Total cholesterol | 931 | 3.32 (2.72, 3.88) | 3.93 (3.39, 4.48) |
| aspartate aminotransferase | 935 | 38 (25, 59) | 21 (17, 29) |
| Uric acid | 934 | 245 (166, 374) | 240 (193, 304) |
| HCO3- | 934 | 21.8 (18.8, 24.7) | 24.7 (22.8, 26.7) |
| calcium | 979 | 2.00 (1.90, 2.08) | 2.17 (2.10, 2.25) |
| Amino-terminal brain natriuretic peptide precursor(NT-proBNP) | 475 | 1,467 (516, 4,578) | 64 (23, 166) |
| Lactate dehydrogenase | 934 | 593 (431, 840) | 220 (189, 278) |
| platelet large cell ratio | 862 | 35 (30, 42) | 28 (23, 33) |
| Interleukin 6 | 272 | 66 (30, 142) | 8 (2, 21) |
| Fibrin degradation products | 330 | 114 (18, 150) | 4 (4, 4) |
| monocytes count | 957 | 0.36 (0.20, 0.58) | 0.43 (0.32, 0.58) |
| PLT distribution width | 862 | 13.60 (12.10, 15.93) | 11.70 (10.70, 13.00) |
| globulin | 930 | 34.1 (30.2, 38.2) | 31.8 (29.5, 35.2) |
| Gamma glutamyl transpeptidase | 930 | 42 (27, 79) | 29 (19, 46) |
| International standard ratio | 659 | 1.31 (1.17, 1.48) | 1.04 (0.99, 1.09) |
| basophil count(#) | 957 | 0.010 (0.010, 0.030) | 0.010 (0.010, 0.020) |
| 2019-nCoV nucleic acid detection | 501 | ||
| -1 | 57 (100%) | 444 (100%) | |
| mean corpuscular hemoglobin | 957 | 31.20 (29.90, 32.70) | 30.70 (29.60, 31.90) |
| Activation of partial thromboplastin time | 568 | 40 (36, 45) | 39 (35, 43) |
| hs_CRP | 737 | 114 (65, 191) | 7 (2, 35) |
| HIV antibody quantification | 278 | 0.08 (0.07, 0.11) | 0.09 (0.08, 0.11) |
| serum sodium | 975 | 142 (138, 148) | 140 (138, 141) |
| thrombocytocrit | 862 | 0.15 (0.10, 0.21) | 0.24 (0.19, 0.30) |
| ESR | 383 | 36 (16, 59) | 26 (13, 40) |
| glutamic-pyruvic transaminase | 931 | 26 (18, 44) | 21 (15, 36) |
| eGFR | 936 | 72 (43, 91) | 100 (85, 114) |
| creatinine | 936 | 88 (68, 130) | 64 (54, 83) |
ml_df <- blood_tests_df %>%
select(-RE_DATE) %>%
group_by(PATIENT_ID) %>%
summarise(across(everything(), function(x) last(na.omit(x)))) %>%
na_mean(option = "median") %>%
select(-PATIENT_ID)
View(ml_df)
ggplot(patients, aes(x = gender, fill = gender)) +
geom_bar() +
labs(y = "Number of patients",
x = "Gender")
patients_hist <- ggplot(patients, aes(x = age, fill = gender)) +
geom_histogram(stat = "count",
binwidth = 1.2)+
labs(y = "Number of patients",
x = "Age") +
scale_x_continuous(breaks=seq(20, 100, 5))
ggplotly(patients_hist)
layout_ggplotly <- function(gg, x = -0.05, y = -0.05){
# The 1 and 2 goes into the list that contains the options for the x and y axis labels respectively
gg[['y']][['layout']][['annotations']][[1]][['y']] <- x
gg[['y']][['layout']][['annotations']][[2]][['x']] <- y
gg
}
patients_outcome <- ggplot(patients, aes(x = age, fill = outcome)) +
geom_histogram(binwidth = 1.2) +
facet_grid(~ gender) +
scale_y_continuous(breaks=seq(0, 20, 2)) +
scale_x_continuous(breaks=seq(20, 100, 5)) +
labs(y = "Number of patients", x = "Age")
ggplotly(patients_outcome)
hospitalization_length_plot <- ggplot(patients, aes(x = hospitalization_length, fill = outcome)) +
geom_histogram(binwidth = 1.2) +
facet_grid(outcome ~ gender) +
scale_y_continuous(breaks=seq(0, 20, 2)) +
scale_x_continuous(breaks=seq(0, 40, 5)) +
labs(y = "Number of patients",
x = "Hospitalization length [days]")
ggplotly(hospitalization_length_plot)
outcome_per_day <- patients %>%
mutate(discharge_time = as.Date(discharge_time)) %>%
filter(outcome == "Died")
outcome_per_day_plot <- ggplot(outcome_per_day, aes(x = discharge_time, fill = outcome)) +
geom_histogram(binwidth = 1.2) +
facet_grid(~ gender) +
labs(x = "Discharge date", y = "Number of deaths") +
theme(legend.position = "none")
ggplotly(outcome_per_day_plot)
outcome_during_day_plot <- patients %>%
mutate(time_h_m = hms(format(patients$discharge_time, format = "%H:%M:%S"))) %>%
mutate(time_h_m = (hour(time_h_m) + minute(time_h_m)/60)) %>%
filter(outcome == "Died") %>%
ggplot(aes(x = time_h_m, fill = "blue")) +
geom_histogram(binwidth = 1.2) +
scale_x_continuous(breaks = seq(0, 24, by = 1)) +
labs(x = "Number of dead cases", y = "Time of the day")+
theme(legend.position = "none")
ggplotly(outcome_during_day_plot)
# Variables correlation Preparing the dataset for correlation (changing factor variables to numeric).
cor_df <- df %>%
select(-c(PATIENT_ID, RE_DATE, admission_time, discharge_time)) %>%
mutate(outcome = ifelse(df$outcome == "Died", 1, 0)) %>%
mutate(gender = ifelse(df$gender == "male", 1, 0)) %>%
rename(male = gender)
correlationMatrix <- correlate(cor_df[sapply(cor_df, is.numeric)], use='pairwise.complete.obs')
From the previous analysis, it is known that elderly people are more susceptible to die due to Cocvid-19. Below short summary, what biomarkers are highly correlated with age.
age_correlation <- correlationMatrix %>%
focus(age) %>%
mutate(age = abs(age)) %>%
arrange(desc(age)) %>%
filter(rowname != "outcome") %>% head
age_correlation %>% kbl() %>% kable_paper("hover")
| rowname | age |
|---|---|
| eGFR | 0.6295013 |
| (%)lymphocyte | 0.5468574 |
| neutrophils(%) | 0.5216471 |
| albumin | 0.5077106 |
| Urea | 0.4233471 |
| hs_CRP | 0.4007634 |
The most correlated is eGFR which is used to measure the the effectiveness of the work of the kidneys. Its hard to present a norm value, because this marker depends on many factors like gender, age, body mass, but some sources show that value above 90 is proper. Too low ,and too high value of GFR in some cases indicates kidney diseases which affect the blood filtration.
Below a chart presents the GFR value between patients in different age, grouped by outcome. It’s analysis shows, that many elderly patients that died, had some abnormalities in the work of the kidneys.
ggplot(ml_df, aes(x = age, y = `eGFR`, color = outcome)) +
geom_point() +
theme(legend.position = c(0.9,0.9)) +
ylim(0 , 150)
The next two high correlated biomarkers are related to immune system. The values of lymphocyte and neutrophils show how strong the organism is and how well it fights with the disease.
Lymphocytes are cells responsible for protecting our body (by creating anitbodies) from viruses, bacteria and other disease causing factors. The norm value for an adult is between 15 - 40%. Lower lymphocytes levels means, that the body cannot fight the disease. The left chart below confirms, that elderly people have weaker immune system and it’s hard for their organism to fight the disease.
plot1 <- ggplot(ml_df, aes(x = age, y = `(%)lymphocyte`, color = outcome)) + geom_point() + theme(legend.position = "none")
plot2 <- ggplot(ml_df, aes(x = `lymphocyte count`, y = `(%)lymphocyte`, color = outcome)) +
geom_point() +
theme(legend.position = c(0.8, 0.2)) +
xlim(0,3.75)
grid.arrange(plot1, plot2, ncol=2)
Neutrophils are essential part of immune system - this cells search for pathogens in organisms and destroy them. High value of neutrophils(%) results in many neutrophil cells in blood (right plot below), which means that a medical condition occurs in patients body and that the immune system fights it.
This correlation explains that elderly people are more vulnerable, and their immune systems need to produce more neutrophils to fight the pathogens than in younger patients. The left plot shows that some of the tested patients had some medical condition, due to increased amount of neutrophils. Adding the information about the outcome, confirms that elderly patients are more likely to die because of Covid-19.
plot1 <- ggplot(ml_df, aes(x = age, y = `neutrophils(%)`, color = outcome)) + geom_point() + theme(legend.position = "none")
plot2 <- ggplot(ml_df, aes(x = `neutrophils count`, y = `neutrophils(%)`, color = outcome)) + geom_point() + theme(legend.position = c(0.8, 0.2))
grid.arrange(plot1, plot2, ncol=2)
The following section is devoted to check the correlation between biomarkers and the outcome.
The correlation matrix for the highest correlated variables and the numeric correlation values are shown below.
'%ni%' <- Negate('%in%')
outcome_cor <- correlationMatrix %>%
focus(outcome) %>%
mutate(outcome = abs(outcome)) %>%
arrange(desc(outcome)) %>%
filter(`rowname` %ni% c('neutrophils(%)', 'neutrophils count')) %>%
mutate(outcome = round(outcome,2)) %>%
head
outcome_corr_df <- cor_df %>% select(c(outcome_cor$rowname, outcome))
outcome_cor_matrix <- cor(outcome_corr_df[sapply(outcome_corr_df, is.numeric)], use='pairwise.complete.obs')
corrplot(outcome_cor_matrix)
The previous sections contains the analysis about lymphocytes and how important they are when fighting with the disease, that’s why they won’t be considered in this section.
outcome_cor %>% kbl() %>% kable_paper("hover")
| rowname | outcome |
|---|---|
| (%)lymphocyte | 0.72 |
| albumin | 0.69 |
| Prothrombin activity | 0.66 |
| hs_CRP | 0.65 |
| D-D dimer | 0.64 |
| Lactate dehydrogenase | 0.62 |
Below in each tab are presented the values of each biomarkers for all the patients grouped by age and outcome. Analysis of theses data shows, that all the biomarkers are also somehow correlated with the age, because the biomarkers values for eldery are very often (in this 5 biomarkers) outstanding from the values for people less than 50 years. This statement is confirmed by the boxplots below every chart, preseting the distribution of the biomarkers grouped by age group (adult - less than 64 years, eldery - more than 64 years), gender and outcome.
layout_ggplotly <- function(gg, x = -0.02, y = -0.05){
# The 1 and 2 goes into the list that contains the options for the x and y axis labels respectively
gg[['x']][['layout']][['annotations']][[1]][['y']] <- x
gg[['x']][['layout']][['annotations']][[2]][['x']] <- y
gg
}
ggplot(ml_df, aes(x = age, y = `albumin`, color = outcome)) + geom_point()
albumin_plot <- ml_df %>%
mutate(age_group = as.factor(ifelse(ml_df$age < 64, 'adult', 'elderly'))) %>%
ggplot(aes(x= age_group, y = `albumin`, fill = gender)) +
geom_boxplot(na.rm=TRUE) + facet_grid(~outcome) +
labs(x = "Age group", y = "Albumin")
ggplotly(albumin_plot) %>% layout(boxmode = "group") %>% layout_ggplotly
ggplot(ml_df, aes(x = age, y = `Prothrombin activity`, color = outcome)) + geom_point()
pt_plot <- ml_df %>%
mutate(age_group = as.factor(ifelse(ml_df$age < 64, 'adult', 'elderly'))) %>%
ggplot(aes(x= age_group, y = `Prothrombin activity`, fill = gender)) +
geom_boxplot(na.rm=TRUE) + facet_grid(~outcome) +
labs(x = "Age group", y = "Prothrombin activity")
ggplotly(pt_plot) %>% layout(boxmode = "group") %>% layout_ggplotly
A norm value for hs-CRP is about 50. All values above that level indicate some kind of inflammation in the body. Many values on the first plot are much more above the norm level showing very strong inflammation which eventually (probably) contributed to the death.
ggplot(ml_df, aes(x = age, y = hs_CRP, color = outcome)) + geom_point()
crp_plot <- ml_df %>%
mutate(age_group = as.factor(ifelse(ml_df$age < 64, 'adult', 'elderly'))) %>%
ggplot(aes(x= age_group, y = hs_CRP, fill = gender)) +
geom_boxplot(na.rm=TRUE) + facet_grid(~outcome) +
labs(x = "Age group", y = "High sensitivity C-reactive protein")
ggplotly(crp_plot) %>% layout(boxmode = "group") %>% layout_ggplotly
D-diemrs are cells responsible for decomposition of a clot. Their high value mean that there was a blood clot in the organism. Sometimes it can be linked with myocardial infarction, pulmonary embolism which combined with Covid-19 symptoms can lead to death.
ggplot(ml_df, aes(x = age, y = `D-D dimer`, color = outcome)) + geom_point()
dimer_plot <- ml_df %>%
mutate(age_group = as.factor(ifelse(ml_df$age < 64, 'adult', 'elderly'))) %>%
ggplot(aes(x= age_group, y = `D-D dimer`, fill = gender)) +
geom_boxplot(na.rm=TRUE) + facet_grid(~outcome) +
labs(x = "Age group", y = "D-D dimer")
ggplotly(dimer_plot) %>% layout(boxmode = "group") %>% layout_ggplotly
ggplot(ml_df, aes(x = age, y = `Lactate dehydrogenase`, color = outcome)) + geom_point()
ldh_plot <- ml_df %>%
mutate(age_group = as.factor(ifelse(ml_df$age < 64, 'adult', 'elderly'))) %>%
ggplot(aes(x= age_group, y = `Lactate dehydrogenase`, fill = gender)) +
geom_boxplot(na.rm=TRUE) + facet_grid(~outcome) +
labs(x = "Age group", y = "Lactate dehydrogenase")
ggplotly(ldh_plot) %>% layout(boxmode = "group") %>% layout_ggplotly
patients_agg <- patients %>% select(c(discharge_time, outcome)) %>%
mutate(discharge_time = as.Date(patients$discharge_time, "%m/%d/%Y" )) %>%
filter(outcome == 'Died') %>%
group_by(discharge_time) %>%
summarise(deaths_count = n(), .groups="drop") %>%
arrange(discharge_time) %>%
mutate(deaths_count_agg = cumsum(deaths_count))
ggplot(patients_agg, aes(x = discharge_time, y = deaths_count_agg)) +
geom_line(size = 1.1, color = 'red') +
transition_reveal(discharge_time) +
labs(x = "Discharde time", y = "Deaths count aggregate") +
scale_x_continuous(breaks = seq(min(patients_agg$discharge_time), max(patients_agg$discharge_time), 10))
In this chapter 3 classification models are trained to predict the outcome (death/survival) of COVID-19 sick patients based on basic patients observations and blood test samples. One blood test for each patient is considered as an observation for the machine learning algorithms. Extra data pre processing was needed to prepare the dataset. For each patient, all the blood test are reduced to one row, containing the closest value to the discharge time. Missing values for some variables were replaced with median, due to very unsymmetrical distribution.
Redundant columns like patient id, blood test time and admission and discharge time were removed from the dataset.
The preprocessed data is split into two datasets training and testing.
set.seed(23)
rows <- sample(nrow(ml_df))
ml_df <- ml_df[rows,]
set.seed(23)
inTraining <- createDataPartition(y = ml_df$outcome, p=.70, list=FALSE)
training <- ml_df[inTraining,]
testing <- ml_df[-inTraining,]
The first dead patient in the original dataset was on 220 place, so the below check is done to be sure that training and testing sets for the model have similar output class distribution.
training %>% select(gender, outcome) %>% tbl_summary(by = outcome)
| Characteristic | Died, N = 1221 | Survived, N = 1411 |
|---|---|---|
| gender | ||
| female | 39 (32%) | 73 (52%) |
| male | 83 (68%) | 68 (48%) |
|
1
Statistics presented: n (%)
|
||
testing %>% select(gender, outcome) %>% tbl_summary(by = outcome)
| Characteristic | Died, N = 521 | Survived, N = 601 |
|---|---|---|
| gender | ||
| female | 9 (17%) | 30 (50%) |
| male | 43 (83%) | 30 (50%) |
|
1
Statistics presented: n (%)
|
||
For the learning process Repeated Cross-Validation was used. The training process will be repeated 5 times, and each time the training set will be split to training and validation sets.
ctrl <- trainControl(method="repeatedcv",
number = 10,
repeats = 3)
For reproducibility.
set.seed(23)
Standarization of values for SVM Polynominal Classifier.
set.seed(23)
preProcValues <- preProcess(training, method = c("center", "scale"))
trainTransformed <- predict(preProcValues, training)
testTransformed <- predict(preProcValues, testing)
dataset <- rbind(trainTransformed, testTransformed)
Creating the classifier with training dataset.
set.seed(23)
svm_fit <- train(outcome ~ .,
data = trainTransformed,
method = "svmPoly",
trControl = ctrl,
tuneGrid = data.frame(degree = 1,scale = 1, C = 1))
svm_classes <- predict(svm_fit, newdata = testTransformed)
svm_classes_prob <- predict(svm_fit, newdata = testTransformed, type = "prob")
confusionMatrix(data = svm_classes, testTransformed$outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Died Survived
## Died 47 7
## Survived 5 53
##
## Accuracy : 0.8929
## 95% CI : (0.8203, 0.9434)
## No Information Rate : 0.5357
## P-Value [Acc > NIR] : 4.008e-16
##
## Kappa : 0.7852
##
## Mcnemar's Test P-Value : 0.7728
##
## Sensitivity : 0.9038
## Specificity : 0.8833
## Pos Pred Value : 0.8704
## Neg Pred Value : 0.9138
## Prevalence : 0.4643
## Detection Rate : 0.4196
## Detection Prevalence : 0.4821
## Balanced Accuracy : 0.8936
##
## 'Positive' Class : Died
##
learning_curve_dat <- function(data,
test_prob = 0.3,
proportion = (1:10)/10,
train_method,
ctrl = NULL,
# train parameters
...)
{
proportion <- sort(unique(proportion))
dataframe_size <- length(proportion) * 2
n_size <- length(proportion)
set.seed(23)
inTraining <- createDataPartition(data$outcome, p = 1 - test_prob, list = FALSE)
training <- data[inTraining,]
testing <- data[-inTraining,]
training_data_size <- nrow(training)
learning_error_df <- data.frame(training_size = integer(dataframe_size),
data = character(dataframe_size),
error_value = numeric(dataframe_size))
index <- 1
for(i in seq(along = proportion)) {
training_set <- training[1:floor((training_data_size * proportion[i])),]
training_set_size <- nrow(training_set)
set.seed(23)
model <- train(outcome ~ .,
data = training_set,
method = train_method,
trControl = ctrl,
...)
# Training Data Error Value
learning_error_df$training_size[index] <- training_set_size
learning_error_df$data[index] <- "Training"
learning_error_df$error_value[index] <- rmse(training_set$outcome, predict(model, newdata = training_set))
# Testing Data Error Value
learning_error_df$training_size[index + 1] <- training_set_size
learning_error_df$data[index + 1] <- "Testing"
learning_error_df$error_value[index + 1] <- rmse(testing$outcome, predict(model, newdata = testing))
index <- index + 2
}
learning_error_df
}
# svm_learninig_curve <- learning_curve_dat(dataset,
# test_prob = 0.3,
# train_method = "svmPoly",
# ctrl = ctrl,
# tuneGrid = data.frame(degree = 1,scale = 1, C = 1))
#
# ggplot(svm_learninig_curve, aes(x = training_size)) +
# geom_line(aes(y = error_value, color = data), size=2.0) +
# theme_bw() +
# labs(title = "Learning Curves",
# x = "Training set size",
# y = "RMSE value")
# svm_ROC <- roc(response = testing$outcome,
# predictor = svm_classes_prob[, "Died"],
# levels = rev(levels(testing$outcome)),
# plot = TRUE,
# auc = TRUE,
# print.auc = TRUE)
set.seed(23)
nb_fit <- train(outcome ~ .,
data = training,
method = "naive_bayes",
trControl = ctrl)
nb_classes <- predict(nb_fit, newdata = testing)
nb_classes_prob <- predict(nb_fit, newdata = testing, type = "prob")
confusionMatrix(data = nb_classes, testing$outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Died Survived
## Died 48 3
## Survived 4 57
##
## Accuracy : 0.9375
## 95% CI : (0.8755, 0.9745)
## No Information Rate : 0.5357
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8742
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9231
## Specificity : 0.9500
## Pos Pred Value : 0.9412
## Neg Pred Value : 0.9344
## Prevalence : 0.4643
## Detection Rate : 0.4286
## Detection Prevalence : 0.4554
## Balanced Accuracy : 0.9365
##
## 'Positive' Class : Died
##
nb_ROC <- roc(response = testing$outcome,
predictor = nb_classes_prob[, "Died"],
levels = rev(levels(testing$outcome)),
plot = TRUE,
auc = TRUE,
print.auc = TRUE)
nb_ROC
##
## Call:
## roc.default(response = testing$outcome, predictor = nb_classes_prob[, "Died"], levels = rev(levels(testing$outcome)), auc = TRUE, plot = TRUE, print.auc = TRUE)
##
## Data: nb_classes_prob[, "Died"] in 60 controls (testing$outcome Survived) < 52 cases (testing$outcome Died).
## Area under the curve: 0.9744
set.seed(17)
fit <- train(outcome ~ ., data = training, method = "rf", trControl = ctrl, ntree=10)
rfClasses <- predict(fit, newdata = testing)
confusionMatrix(data =rfClasses, testing$outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Died Survived
## Died 50 7
## Survived 2 53
##
## Accuracy : 0.9196
## 95% CI : (0.8529, 0.9626)
## No Information Rate : 0.5357
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8395
##
## Mcnemar's Test P-Value : 0.1824
##
## Sensitivity : 0.9615
## Specificity : 0.8833
## Pos Pred Value : 0.8772
## Neg Pred Value : 0.9636
## Prevalence : 0.4643
## Detection Rate : 0.4464
## Detection Prevalence : 0.5089
## Balanced Accuracy : 0.9224
##
## 'Positive' Class : Died
##